from pyspark import SparkContext
import plotly.express as px
import pandas as pd
from pyspark.sql import SparkSession
import nltk
# Create a SparkContext and load the Spark README as a text-file RDD.
sc = SparkContext()
logfile = "file:///opt/spark/spark-3.1.2-bin-hadoop3.2/README.md"
# cache() keeps the RDD in memory, since it is reused for several actions below
logdata = sc.textFile(logfile).cache()
# Action: number of lines in the file
logdata.count()
108
logdata.first()
'# Apache Spark'
# Count lines containing the letter 'a' and the letter 'b', respectively.
numa = logdata.filter(lambda s: 'a' in s).count()
numb = logdata.filter(lambda s: 'b' in s).count()
print(numa)
64
print(numb)
32
wordCounts = logdata.flatMap(lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a+b)
wordCounts.collect()
[('#', 1),
('Apache', 1),
('Spark', 14),
('is', 7),
('unified', 1),
('analytics', 1),
('engine', 2),
('It', 2),
('provides', 1),
('high-level', 1),
('APIs', 1),
('in', 5),
('Scala,', 1),
('Java,', 1),
('an', 4),
('optimized', 1),
('supports', 2),
('computation', 1),
('analysis.', 1),
('set', 2),
('of', 5),
('tools', 1),
('SQL', 2),
('MLlib', 1),
('machine', 1),
('learning,', 1),
('GraphX', 1),
('graph', 1),
('processing,', 1),
('Structured', 1),
('<https://spark.apache.org/>', 1),
('Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)',
1),
('Coverage](https://img.shields.io/badge/dynamic/xml.svg?label=pyspark%20coverage&url=https%3A%2F%2Fspark-test.github.io%2Fpyspark-coverage-site&query=%2Fhtml%2Fbody%2Fdiv%5B1%5D%2Fdiv%2Fh1%2Fspan&colorB=brightgreen&style=plastic)](https://spark-test.github.io/pyspark-coverage-site)',
1),
('Documentation', 1),
('latest', 1),
('programming', 1),
('guide,', 1),
('[project', 1),
('page](https://spark.apache.org/documentation.html).', 1),
('README', 1),
('only', 1),
('basic', 1),
('instructions.', 1),
('Building', 1),
('using', 3),
('[Apache', 1),
('run:', 1),
('do', 2),
('this', 1),
('downloaded', 1),
('documentation', 3),
('project', 1),
('site,', 1),
('at', 2),
('Spark"](https://spark.apache.org/docs/latest/building-spark.html).', 1),
('development', 1),
('tips,', 1),
('developing', 1),
('IDE,', 1),
('["Useful', 1),
('Developer', 1),
('Interactive', 2),
('Shell', 2),
('The', 1),
('way', 1),
('start', 1),
('Try', 1),
('following', 2),
('scala>', 1),
('spark.range(1000', 2),
('*', 4),
('1000).count()', 2),
('Python', 2),
('Alternatively,', 1),
('use', 3),
('And', 1),
('run', 7),
('Example', 1),
('several', 1),
('programs', 2),
('them,', 1),
('`./bin/run-example', 1),
('[params]`.', 1),
('example:', 1),
('./bin/run-example', 2),
('SparkPi', 2),
('variable', 1),
('when', 1),
('examples', 2),
('spark://', 1),
('URL,', 1),
('YARN,', 1),
('"local"', 1),
('locally', 2),
('N', 1),
('abbreviated', 1),
('class', 2),
('name', 1),
('package.', 1),
('instance:', 1),
('print', 1),
('usage', 1),
('help', 1),
('no', 1),
('params', 1),
('are', 1),
('Testing', 1),
('Spark](#building-spark).', 1),
('Once', 1),
('built,', 1),
('tests', 2),
('using:', 1),
('./dev/run-tests', 1),
('Please', 4),
('guidance', 2),
('module,', 1),
('individual', 1),
('integration', 1),
('test,', 1),
('Note', 1),
('About', 1),
('uses', 1),
('library', 1),
('HDFS', 1),
('other', 1),
('Hadoop-supported', 1),
('storage', 1),
('systems.', 1),
('Because', 1),
('have', 1),
('changed', 1),
('different', 1),
('versions', 1),
('Hadoop,', 2),
('must', 1),
('against', 1),
('version', 1),
('refer', 2),
('YARN"](https://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version-and-enabling-yarn)',
1),
('particular', 2),
('distribution', 1),
('Hive', 2),
('Thriftserver', 1),
('distributions.', 1),
('[Configuration', 1),
('online', 1),
('overview', 1),
('configure', 1),
('Spark.', 1),
('Contributing', 1),
('guide](https://spark.apache.org/contributing.html)', 1),
('started', 1),
('contributing', 1),
('project.', 1),
('a', 9),
('for', 12),
('large-scale', 1),
('data', 2),
('processing.', 2),
('Python,', 2),
('and', 9),
('R,', 1),
('that', 2),
('general', 2),
('graphs', 1),
('also', 5),
('rich', 1),
('higher-level', 1),
('including', 4),
('DataFrames,', 1),
('Streaming', 1),
('stream', 1),
('[](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7-hive-2.3)',
1),
('[.', 1),
('To', 2),
('build', 3),
('its', 1),
('example', 3),
('programs,', 1),
('./build/mvn', 1),
('-DskipTests', 1),
('clean', 1),
('package', 1),
('(You', 1),
('not', 1),
('need', 1),
('to', 16),
('if', 4),
('you', 4),
('pre-built', 1),
('package.)', 1),
('More', 1),
('detailed', 2),
('available', 1),
('from', 1),
('["Building', 1),
('For', 3),
('info', 1),
('see', 3),
('Tools"](https://spark.apache.org/developer-tools.html).', 1),
('Scala', 2),
('easiest', 1),
('through', 1),
('shell:', 2),
('./bin/spark-shell', 1),
('command,', 2),
('which', 2),
('should', 2),
('return', 2),
('1,000,000,000:', 2),
('1000', 2),
('prefer', 1),
('./bin/pyspark', 1),
('>>>', 1),
('Programs', 1),
('comes', 1),
('with', 3),
('sample', 1),
('`examples`', 2),
('directory.', 1),
('one', 2),
('<class>', 1),
('will', 1),
('Pi', 1),
('locally.', 1),
('MASTER', 1),
('environment', 1),
('running', 1),
('submit', 1),
('cluster.', 1),
('be', 2),
('mesos://', 1),
('or', 3),
('"yarn"', 1),
('thread,', 1),
('"local[N]"', 1),
('threads.', 1),
('MASTER=spark://host:7077', 1),
('Many', 1),
('given.', 1),
('Running', 1),
('Tests', 1),
('first', 1),
('requires', 1),
('[building', 1),
('how', 3),
('[run', 1),
('tests](https://spark.apache.org/developer-tools.html#individual-tests).',
1),
('There', 1),
('Kubernetes', 1),
('resource-managers/kubernetes/integration-tests/README.md', 1),
('A', 1),
('Hadoop', 3),
('Versions', 1),
('core', 1),
('talk', 1),
('protocols', 1),
('same', 1),
('your', 1),
('cluster', 1),
('runs.', 1),
('["Specifying', 1),
('Version', 1),
('Enabling', 1),
('building', 2),
('Configuration', 1),
('Guide](https://spark.apache.org/docs/latest/configuration.html)', 1),
('review', 1),
('[Contribution', 1),
('information', 1),
('get', 1)]
book = sc.textFile("book.txt").cache()
"book.txt" is the text file or a digital copy of the book The Hound of the Baskervilles, by Arthur Conan Doyle
book
book.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0
book is now an RDD
llist = book.collect() for line in llist: print(line)
Printing the elements of the array returned by the collect() RDD action
wordCounts_book = book.flatMap(lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a+b)
wordCounts_book.collect()
[('The', 223),
('Project', 79),
('of', 1702),
('Hound', 8),
('Arthur', 2),
('Conan', 4),
('is', 598),
('use', 21),
('anyone', 18),
('anywhere', 5),
('in', 912),
('United', 15),
('States', 8),
('other', 77),
('world', 13),
('at', 346),
('no', 178),
('restrictions', 2),
('whatsoever.', 2),
('may', 111),
('it,', 57),
('give', 37),
('away', 46),
('re-use', 2),
('this', 317),
('online', 4),
('are', 211),
('have', 527),
('check', 6),
('country', 16),
('where', 76),
('before', 78),
('using', 8),
('eBook.', 2),
('Baskervilles', 9),
('Author:', 1),
('Date:', 1),
('October,', 2),
('[eBook', 1),
('27,', 1),
('2021]', 1),
('Language:', 1),
('English', 2),
('Character', 1),
('set', 34),
('UTF-8', 1),
('***', 4),
('OF', 13),
('GUTENBERG', 3),
('HOUND', 3),
('Another', 5),
('A.', 2),
('Robinson,', 1),
('It', 153),
('was', 781),
('account', 9),
('West-Country', 1),
('inception.', 1),
('help', 37),
('details', 6),
('thanks.', 1),
('truly,', 1),
('Hindhead,', 1),
('Haslemere.', 1),
('Contents', 1),
('Chapter', 30),
('1', 1),
('Mr.', 60),
('Curse', 2),
('4', 2),
('Sir', 245),
('Three', 2),
('Merripit', 15),
('House', 7),
('8', 1),
('Report', 4),
('Watson', 15),
('9', 1),
('upon', 315),
('[Second', 2),
('Watson]', 2),
('10', 1),
('Man', 2),
('12', 1),
('Fixing', 2),
('Nets', 2),
('14', 1),
('Retrospection', 2),
('1.', 2),
('very', 178),
('late', 12),
('mornings,', 1),
('save', 13),
('occasions', 1),
('when', 112),
('he', 658),
('seated', 1),
('table.', 4),
('stood', 27),
('hearth-rug', 1),
('picked', 3),
('night', 38),
('thick', 9),
('known', 16),
('as', 386),
('lawyer.”', 1),
('Just', 3),
('head', 22),
('broad', 10),
('silver', 4),
('band', 2),
('an', 199),
('“To', 12),
('his', 658),
('just', 22),
('family', 24),
('used', 14),
('carry—dignified,', 1),
('solid,', 1),
('do', 148),
('make', 48),
('sitting', 6),
('sign', 9),
('occupation.', 1),
('“How', 18),
('know', 97),
('doing?', 2),
('believe', 14),
('eyes', 47),
('“I', 178),
('well-polished,', 1),
('coffee-pot', 1),
('me,”', 3),
('said', 220),
('tell', 60),
('stick?', 1),
('Since', 2),
('we', 346),
('unfortunate', 6),
('notion', 1),
('errand,', 1),
('accidental', 1),
('souvenir', 1),
('importance.', 3),
('Let', 5),
('hear', 23),
('it.”', 38),
('think,”', 1),
('following', 12),
('far', 36),
('companion,', 3),
('“that', 4),
('Mortimer', 40),
('medical', 7),
('well-esteemed', 1),
('mark', 7),
('think', 74),
('deal', 9),
('so?”', 4),
('stick,', 1),
('though', 24),
('originally', 1),
('knocked', 1),
('it.', 71),
('thick-iron', 1),
('ferrule', 1),
('evident', 9),
('amount', 2),
('“And', 49),
('again,', 8),
('there', 175),
('‘friends', 1),
('C.C.H.’', 1),
('guess', 5),
('Something', 4),
('hunt', 3),
('whose', 9),
('members', 1),
('surgical', 1),
('assistance,', 1),
('presentation', 4),
('“Really,', 3),
('pushing', 1),
('chair', 1),
('cigarette.', 2),
('am', 80),
('say', 57),
('accounts', 2),
('good', 40),
('own', 78),
('achievements', 1),
('underrated', 1),
('but', 252),
('genius', 2),
('power', 12),
('stimulating', 2),
('confess,', 2),
('debt.”', 1),
('never', 60),
('before,', 5),
('must', 95),
('admit', 2),
('gave', 28),
('keen', 6),
('admiration', 3),
('publicity', 1),
('methods.', 2),
('way', 48),
('earned', 1),
('approval.', 1),
('now', 52),
('took', 34),
('examined', 5),
('minutes', 7),
('naked', 1),
('eyes.', 13),
('Then', 31),
('expression', 7),
('interest', 19),
('down', 97),
('looked', 47),
('again', 28),
('lens.', 1),
('“Interesting,', 1),
('elementary,”', 1),
('settee.', 1),
('certainly', 26),
('two', 70),
('indications', 4),
('us', 106),
('basis', 3),
('several', 23),
('anything', 35),
('me?”', 5),
('When', 27),
('stimulated', 1),
('fallacies', 1),
('guided', 2),
('towards', 23),
('truth.', 4),
('Not', 8),
('instance.', 1),
('practitioner.', 2),
('And', 62),
('walks', 3),
('“Then', 23),
('right.”', 3),
('“But', 40),
('all.”', 1),
('“No,', 43),
('no,', 13),
('all—by', 1),
('means', 19),
('would', 185),
('suggest,', 1),
('example,', 7),
('doctor', 2),
('more', 122),
('likely', 8),
('hospital', 5),
('than', 91),
('hunt,', 3),
('initials', 6),
('placed', 9),
('‘Charing', 2),
('suggest', 6),
('themselves.”', 1),
('direction.', 4),
('take', 32),
('working', 3),
('hypothesis', 1),
('start', 7),
('unknown', 7),
('then,', 23),
('stand', 6),
('Cross', 4),
('further', 13),
('inferences', 1),
('draw?”', 1),
('“Do', 9),
('none', 9),
('Apply', 1),
('only', 90),
('conclusion', 3),
('farther', 7),
('occasion', 6),
('unite', 2),
('pledge', 2),
('will?', 1),
('service', 5),
('presentation.', 1),
('change', 6),
('stretching', 1),
('_staff_', 1),
('hospital,', 2),
('well-established', 1),
('position,', 2),
('into', 109),
('country.', 6),
('What', 35),
('he,', 24),
('yet', 39),
('staff', 1),
('house-surgeon', 1),
('house-physician—little', 1),
('senior', 1),
('years', 13),
('vanishes', 1),
('air,', 5),
('emerges', 1),
('young', 18),
('thirty,', 1),
('possessor', 1),
('describe', 4),
('larger', 3),
('blew', 1),
('wavering', 1),
('rings', 2),
('latter', 9),
('part,', 2),
('you,”', 5),
('out', 142),
('professional', 5),
('career.”', 1),
('Medical', 2),
('turned', 41),
('visitor.', 3),
('read', 17),
('record', 3),
('James,', 2),
('Grimpen,', 4),
('House-surgeon,', 1),
('Winner', 1),
('Jackson', 1),
('prize', 1),
('Comparative', 1),
('entitled', 1),
('Disease', 1),
('Corresponding', 1),
('Swedish', 1),
('‘Some', 1),
('Atavism’', 1),
('Psychology_,', 1),
('1883).', 1),
('Officer', 1),
('parishes', 1),
('Barrow.”', 1),
('mention', 6),
('Watson,”', 12),
('smile,', 2),
('observed.', 2),
('fairly', 6),
('inferences.', 1),
('right,', 5),
('absent-minded.', 1),
('amiable', 2),
('unambitious', 1),
('abandons', 1),
('career', 2),
('country,', 3),
('leaves', 3),
('after', 48),
('waiting', 13),
('room.”', 1),
('dog?”', 1),
('habit', 4),
('master.', 3),
('heavy', 11),
('tightly', 1),
('middle,', 1),
('teeth', 2),
('visible.', 2),
('space', 6),
('these', 58),
('marks,', 1),
('opinion', 4),
('mastiff.', 1),
('been—yes,', 1),
('spaniel.”', 1),
('paced', 1),
('room', 15),
('Now', 7),
('halted', 5),
('glanced', 9),
('sure', 18),
('“For', 4),
('simple', 7),
('move,', 1),
('brother', 19),
('presence', 8),
('assistance', 3),
('dramatic', 1),
('step', 8),
('stair', 2),
('life,', 5),
('ill.', 1),
('ask', 19),
('in!”', 1),
('appearance', 9),
('typical', 1),
('long', 53),
('nose', 1),
('like', 47),
('jutted', 2),
('grey', 17),
('closely', 5),
('glasses.', 1),
('clad', 1),
('rather', 24),
('fashion,', 3),
('frock-coat', 1),
('dingy', 1),
('trousers', 2),
('Though', 2),
('already', 25),
('walked', 17),
('benevolence.', 1),
('entered', 6),
('ran', 19),
('Office.', 1),
('lose', 5),
('world.”', 1),
('“A', 20),
('presentation,', 1),
('“Yes,', 43),
('sir.”', 11),
('Hospital?”', 1),
('shaking', 3),
('blinked', 1),
('astonishment.', 1),
('bad?”', 1),
('deductions.', 1),
('say?”', 2),
('married,', 1),
('consulting', 2),
('home', 12),
('own.”', 1),
('“Come,', 3),
('come,', 4),
('all,”', 1),
('Mortimer—”', 1),
('“Mister,', 1),
('sir,', 53),
('Mister—a', 1),
('evidently.”', 1),
('dabbler', 1),
('picker', 1),
('shores', 1),
('ocean.', 1),
('not—”', 1),
('“Glad', 1),
('meet', 12),
('heard', 63),
('name', 27),
('connection', 11),
('friend.', 4),
('skull', 7),
('well-marked', 1),
('development.', 1),
('finger', 1),
('fissure?', 1),
('cast', 6),
('until', 56),
('ornament', 1),
('anthropological', 1),
('museum.', 1),
('intention', 3),
('fulsome,', 1),
('confess', 8),
('covet', 1),
('skull.”', 1),
('strange', 22),
('line', 8),
('thought,', 2),
('mine,”', 1),
('tobacco', 3),
('surprising', 3),
('quivering', 3),
('fingers', 5),
('agile', 1),
('insect.', 1),
('silent,', 2),
('darting', 1),
('glances', 1),
('showed', 20),
('curious', 11),
('companion.', 4),
('sir,”', 6),
('honour', 5),
('call', 16),
('last', 61),
('opportunity', 8),
('came', 66),
('myself', 21),
('unpractical', 1),
('suddenly', 11),
('Recognizing,', 1),
('do,', 5),
('expert', 5),
('Europe—”', 1),
('sir!', 2),
('May', 3),
('first?”', 1),
('asperity.', 1),
('precisely', 2),
('scientific', 6),
('mind', 24),
('work', 49),
('always', 28),
('appeal', 1),
('him?”', 8),
('But', 87),
('trust,', 1),
('“Just', 4),
('wisely', 1),
('ado', 1),
('demand', 2),
('manuscript,”', 1),
('room,”', 1),
('“Early', 1),
('that,', 11),
('presented', 3),
('talking.', 1),
('poor', 21),
('decade', 1),
('so.', 11),
('subject.', 2),
('put', 32),
('1730.”', 1),
('breast-pocket.', 1),
('“This', 7),
('Baskerville,', 20),
('three', 22),
('months', 6),
('ago', 3),
('created', 3),
('excitement', 4),
('strong-minded', 1),
('shrewd,', 1),
('unimaginative', 1),
('prepared', 6),
('end', 24),
('eventually', 2),
('overtake', 4),
('him.”', 8),
('flattened', 1),
('alternative', 3),
('_s_', 1),
('fix', 3),
('shoulder', 5),
('yellow', 10),
('At', 26),
('below', 1),
('large,', 3),
('scrawling', 1),
('figures:', 1),
('“1742.”', 1),
('appears', 7),
('statement', 5),
('sort.”', 1),
('certain', 15),
('family.”', 1),
('understand', 21),
('something', 36),
('modern', 8),
('“Most', 3),
('matter,', 5),
('decided', 2),
('hours.', 2),
('short', 10),
('you.”', 12),
('chair,', 4),
('together,', 10),
('high,', 3),
('curious,', 2),
('origin', 1),
('statements,', 1),
('story', 16),
('belief', 2),
('occurred', 8),
('even', 22),
('Justice', 1),
('forgive', 4),
('ban', 1),
('repentance', 1),
('Learn', 1),
('fear', 24),
('future,', 2),
('passions', 1),
('whereby', 1),
('grievously', 1),
('undoing.', 1),
('Great', 2),
('(the', 1),
('Lord', 1),
('earnestly', 2),
('commend', 2),
('nor', 8),
('gainsaid', 1),
('profane,', 1),
('man.', 17),
('This,', 3),
('truth,', 7),
('neighbours', 5),
('pardoned,', 1),
('seeing', 6),
('humour', 1),
('chanced', 4),
('love', 11),
('(if,', 1),
('indeed,', 5),
('dark', 33),
('passion', 4),
('bright', 4),
('daughter', 1),
('yeoman', 1),
('near', 17),
('maiden,', 3),
('repute,', 1),
('ever', 34),
('feared', 3),
('evil', 11),
('pass', 7),
('companions,', 2),
('carried', 5),
('her', 121),
('father', 5),
('knew.', 3),
('maiden', 1),
('chamber,', 1),
('sat', 20),
('carouse,', 1),
('nightly', 1),
('custom.', 2),
('wits', 4),
('singing', 1),
('shouting', 1),
('terrible', 11),
('oaths', 1),
('below,', 1),
('blast', 1),
('them.', 21),
('stress', 1),
('active', 7),
('growth', 1),
('covered', 6),
('(and', 2),
('covers)', 1),
('eaves,', 1),
('homeward', 1),
('betwixt', 1),
('guests', 1),
('carry', 10),
('drink—with', 1),
('worse', 3),
('things,', 4),
('perchance—to', 1),
('bird', 1),
('seem,', 1),
('dining-hall,', 1),
('flagons', 1),
('flying', 2),
('render', 1),
('Powers', 1),
('wench.', 1),
('revellers', 3),
('drunken', 2),
('rest,', 2),
('hounds', 3),
('crying', 2),
('saddle', 1),
('mare', 2),
('kerchief', 1),
('maid’s,', 1),
('swung', 3),
('them', 54),
('line,', 3),
('full', 24),
('cry', 21),
('moonlight', 2),
('moor.', 51),
('haste.', 1),
('bemused', 1),
('deed', 1),
('moorlands.', 1),
('uproar,', 1),
('horses,', 2),
('length', 5),
('crazed', 3),
('minds,', 2),
('them,', 16),
('thirteen', 1),
('number,', 2),
('started', 9),
('pursuit.', 3),
('clear', 27),
('above', 8),
('swiftly', 11),
('abreast,', 1),
('maid', 3),
('needs', 3),
('home.', 1),
('mile', 4),
('passed', 26),
('seen', 37),
('goes,', 2),
('scarce', 1),
('indeed', 16),
('unhappy', 6),
('mare,', 2),
('mute', 1),
('God', 3),
('skins', 1),
('dabbled', 1),
('past', 7),
('close', 12),
('each,', 1),
('alone,', 1),
('right', 27),
('horse’s', 1),
('Riding', 1),
('hounds.', 1),
('These,', 1),
('whimpering', 1),
('some,', 1),
('starting', 3),
('valley', 3),
('men,', 3),
('advance,', 1),
('drunken,', 1),
('goyal.', 1),
('opened', 14),
('there,', 9),
('days', 19),
('shining', 4),
('her,', 13),
('raised', 6),
('hair', 4),
('dare-devil', 1),
('plucking', 1),
('shaped', 1),
('hound,', 14),
('mortal', 3),
('upon.', 1),
('throat', 6),
('blazing', 5),
('screaming,', 1),
('One,', 1),
('twain', 1),
('rest', 7),
('tale,', 1),
('sorely', 1),
('terror', 5),
('hinted', 1),
('guessed.', 1),
('deaths,', 1),
('sudden,', 1),
('mysterious.', 1),
('shelter', 1),
('ourselves', 11),
('infinite', 3),
('goodness', 1),
('Providence,', 2),
('punish', 1),
('innocent', 3),
('beyond', 8),
('third', 2),
('fourth', 2),
('generation', 1),
('threatened', 2),
('Holy', 1),
('caution', 1),
('forbear', 1),
('crossing', 1),
('moor', 49),
('hours', 10),
('exalted.', 1),
('“[This', 1),
('John,', 3),
('instructions', 3),
('thereof', 1),
('sister', 8),
('Elizabeth.]”', 1),
('singular', 11),
('narrative', 6),
('pushed', 2),
('stared', 11),
('tossed', 1),
('“Well?”', 3),
('fairy', 1),
('newspaper', 2),
('pocket.', 4),
('County', 2),
('Chronicle_', 1),
('14th', 1),
('intent.', 1),
('readjusted', 1),
('began:', 1),
('Mid-Devon', 1),
('election,', 1),
('gloom', 5),
('county.', 2),
('resided', 1),
('period', 2),
('amiability', 1),
('extreme', 2),
('_nouveaux', 1),
('riches_', 1),
('able', 25),
('bring', 10),
('grandeur', 2),
('line.', 2),
('Charles,', 11),
('large', 10),
('sums', 2),
('money', 8),
('African', 2),
('go', 48),
('turns', 1),
('against', 47),
('gains', 1),
('common', 5),
('reconstruction', 1),
('improvement', 1),
('death.', 6),
('childless,', 1),
('openly', 2),
('should,', 1),
('profit', 2),
('bewailing', 1),
('untimely', 1),
('charities', 2),
('chronicled', 1),
('columns.', 2),
('circumstances', 4),
('cannot', 37),
('rumours', 2),
('play,', 1),
('wealth', 2),
('tastes,', 1),
('consisted', 1),
('married', 6),
('named', 3),
('husband', 12),
('acting', 3),
('housekeeper.', 1),
('evidence,', 3),
('corroborated', 2),
('tends', 1),
('points', 8),
('especially', 6),
('manifesting', 1),
('acute', 2),
('attendant', 1),
('evidence', 9),
('simple.', 1),
('alley', 7),
('Barrymores', 3),
('Barrymore', 40),
('prepare', 3),
('That', 33),
('usual', 2),
('walk,', 4),
('twelve', 3),
('open,', 4),
('lantern,', 1),
('search', 6),
('footmarks', 2),
('easily', 8),
('traced', 2),
('walk', 17),
('gate', 11),
('here.', 8),
('proceeded', 2),
('discovered.', 1),
('One', 16),
('master’s', 3),
('footprints', 3),
('altered', 1),
('onward', 4),
('toes.', 1),
('horse-dealer,', 1),
('distance', 10),
('confession', 1),
('cries', 5),
('state', 8),
('direction', 15),
('signs', 7),
('person,', 3),
('pointed', 6),
('incredible', 3),
('distortion—so', 1),
('refused', 6),
('symptom', 1),
('unusual', 1),
('cases', 10),
('cardiac', 1),
('exhaustion.', 2),
('borne', 2),
('post-mortem', 1),
('examination,', 1),
('long-standing', 1),
('organic', 1),
('jury', 1),
('verdict', 1),
('evidence.', 2),
('heir', 9),
('settle', 2),
('interrupted.', 1),
('Had', 2),
('prosaic', 2),
('romantic', 1),
('affair,', 2),
('tenant', 4),
('understood', 4),
('kin', 2),
('alive,', 2),
('son', 4),
('brother.', 4),
('America,', 4),
('instituted', 2),
('replaced', 1),
('“Those', 2),
('public', 13),
('presents', 2),
('interest.', 4),
('preoccupied', 2),
('Vatican', 1),
('oblige', 1),
('Pope', 1),
('lost', 10),
('touch', 4),
('interesting', 10),
('facts?”', 2),
('let', 15),
('private', 9),
('back,', 4),
('assumed', 1),
('anyone.', 4),
('inquiry', 5),
('science', 2),
('shrinks', 1),
('placing', 1),
('position', 12),
('indorse', 1),
('superstition.', 2),
('says,', 1),
('untenanted', 1),
('increase', 1),
('grim', 8),
('reputation.', 1),
('both', 25),
('thought', 35),
('result', 3),
('why', 29),
('perfectly', 4),
('frank.', 1),
('inhabited,', 2),
('live', 7),
('exception', 1),
('Lafter', 5),
('Stapleton,', 15),
('naturalist,', 2),
('chance', 24),
('community', 1),
('interests', 1),
('kept', 18),
('charming', 2),
('discussing', 1),
('comparative', 1),
('anatomy', 1),
('Bushman', 1),
('Hottentot.', 1),
('“Within', 1),
('increasingly', 1),
('heart—so', 1),
('although', 2),
('grounds,', 1),
('night.', 13),
('Incredible', 1),
('appear', 2),
('convinced', 4),
('dreadful', 8),
('overhung', 1),
('family,', 6),
('ancestors', 1),
('encouraging.', 1),
('idea', 11),
('constantly', 1),
('haunted', 1),
('journeys', 1),
('creature', 15),
('baying', 2),
('hound.', 7),
('question', 7),
('times,', 1),
('vibrated', 1),
('excitement.', 1),
('house', 25),
('weeks', 3),
...]
Frequency of words before processing them with the RDD functions, as given in the example of this lab
book.take(5)
['The Project Gutenberg eBook of The Hound of the Baskervilles, by Arthur Conan Doyle', '', 'This eBook is for the use of anyone anywhere in the United States and', 'most other parts of the world at no cost and with almost no restrictions', 'whatsoever. You may copy it, give it away or re-use it under the terms']
def func(lines):
    """Normalize one line of text: lowercase it and split it on whitespace.

    Returns a list of lowercase tokens (an empty list for a blank line).
    """
    return lines.lower().split()
rdd1 = book.map(func)
Spark map() transformation applies a function to each row in the RDD
rdd1.take(5)
[['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'hound', 'of', 'the', 'baskervilles,', 'by', 'arthur', 'conan', 'doyle'], [], ['this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and'], ['most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions'], ['whatsoever.', 'you', 'may', 'copy', 'it,', 'give', 'it', 'away', 'or', 're-use', 'it', 'under', 'the', 'terms']]
rdd2 = book.flatMap(func)
rdd2.take(5)
['the', 'project', 'gutenberg', 'ebook', 'of']
Spark flatMap() transformation flattens the DataFrame/Dataset after applying the function on every element and returns a new transformed Dataset
# Stop-word list: single letters, digits, and common English function words.
# Based on the NLTK English stop-word list, extended with a few extra tokens
# ('jobs', 'job', 'amp', 'im') found by inspecting the data.
stop_words = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'all', 'just',
'being', 'over', 'both', 'through', 'yourselves', 'its', 'before', 'herself', 'had', 'should', 'to',
'only', 'under', 'ours', 'has', 'do', 'them', 'his', 'very', 'they', 'not', 'during', 'now', 'him',
'nor', 'did', 'this', 'she', 'each', 'further', 'where', 'few', 'because', 'doing', 'some', 'are',
'our', 'ourselves', 'out', 'what', 'for', 'while', 'does', 'above', 'between', 'be', 'we', 'who',
'were', 'here', 'hers', 'by', 'on', 'about', 'of', 'against', 'or', 'own', 'into', 'yourself', 'down',
'your', 'from', 'her', 'their', 'there', 'been', 'whom', 'too', 'themselves', 'was', 'until', 'more',
'himself', 'that', 'but', 'don', 'with', 'than', 'those', 'he', 'me', 'myself', 'these', 'up', 'will',
'below', 'can', 'theirs', 'my', 'and', 'then', 'is', 'am', 'it', 'an', 'as', 'itself', 'at', 'have',
'in', 'any', 'if', 'again', 'no', 'when', 'same', 'how', 'other', 'which', 'you', 'after', 'most',
'such', 'why', 'off', 'yours', 'so', 'the', 'having', 'once', 'jobs', 'job', 'amp', 'im']
common stop Words in English , English Stop-word list referred from "https://gist.githubusercontent.com/sebleier/5542/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c/NLTK's%2520list%2520of%2520english%2520stopwords" and added a few more words by inspecting data.
rdd3 = rdd2.filter(lambda x: x not in stop_words) #rdd3 is book without stop words
rdd2.take(10)
['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'hound', 'of', 'the', 'baskervilles,']
Removing the stop words from the RDD by using the filter RDD function
rdd3.take(10)
['project', 'gutenberg', 'ebook', 'hound', 'baskervilles,', 'arthur', 'conan', 'doyle', 'ebook', 'use']
As seen above we can observe that the stop words have been removed; rdd3 does not have words 'the' , 'of' ...
# Group words by their first two characters (w[0:2]).
rdd4 = rdd3.groupBy(lambda w: w[0:2])
# Show one group: its 2-character key and the materialized list of its words.
print ([(k , list(v)) for (k,v) in rdd4.take(1)])
[('pr', ['project', 'project', 'produced', 'project', 'problem', 'practitioner', 'probability', 'practitioner', 'practitioner', 'presentation', 'proud,', 'practitioner.', 'presentation', 'probability', 'practised', 'probable', 'presentation', 'practice', 'presentation.', 'practice.', 'presentation', 'probable.”', 'practice', 'practitioner', 'professional', 'prize', 'progress?’', 'professional', 'presence', 'practitioner.', 'professional', 'presentation,', 'practice.', 'precise', 'presume', 'presume,', 'problem.', 'precisely', 'precisely', 'practical', 'problem', 'presented', 'practical,', 'prepared', 'practical', 'practical,', 'pressing', 'prayer', 'profane,', 'providence,', 'providence,', 'probable', 'profit', 'prepare', 'proceeded', 'prosaic', 'presents', 'preoccupied', 'private', 'practical', 'presence', 'prints', 'problem', 'presented', 'presume?”', 'prosperity', 'presence.', 'probably', 'proceed', 'problem', 'pretty', 'presume?”', 'prison', 'princetown.', 'problem.', 'presuming', 'probable,', 'practical', 'promised', 'proposed', 'printed', 'preceding', 'printed', 'printed', 'promise', 'present', 'protective', 'professional', 'presume,', 'print', 'probability', 'print.', 'printed', 'proper', 'probabilities', 'private', 'promise', 'presented', 'problem', 'presents', 'practical', 'prevent', 'prefer', 'proceeding', 'probability', 'promise', 'profess', 'private', 'prevent', 'profit', 'provisions', 'pressed', 'present', 'property?', 'provision', 'practice', 'present', 'practice', 'present', 'proposition', 'promise', 'professed', 'printed', 'proved', 'pretty', 'prettily', 'preserve', 'precautions.”', 'presume?”', 'prosaic', 'pride,', 'princetown,', 'projected.', 'probably', 'present.', 'propitious', 'proved', 'promised', 'proof', 'prayed,', 'preoccupations', 'prim-faced', 'presumption,', 'pretend', 'present.', 'propose', 'promise', 'prey.', 'prehistoric', 'progress', 'proud,', 'prosperous', 'privilege', 'propose', 'pressure', 'promise', 'previous', 'pretty', 
'prehistoric', 'presence', 'probably', 'practical', 'prevent', 'prehistoric', 'prosecute', 'present,', 'probably', 'present,', 'prosecute', 'proof', 'profile', 'prepared', 'pretty', 'prepared', 'pretext', 'protest.', 'prairie', 'private,”', 'prevent', 'protest', 'private', 'prepare', 'promise', 'promised,', 'precaution', 'preoccupied', 'pressed', 'prison,', 'price', 'prevent', 'private', 'presence', 'proof', 'princetown', 'prisoner.', 'progress.', 'proceed,', 'present', 'protecting', 'pretty', 'provided', 'prehistoric', 'proved', 'pretty', 'pressed', 'precious', 'problem', 'private', 'prickings', 'presence', 'preferred', 'prevent', 'private', 'pressing', 'pressing', 'private', 'prospect', 'proceedings', 'probability', 'proceed', 'property,', 'proud', 'protection', 'present', 'prowling', 'preserved', 'presume?”', 'presence', 'pride', 'practised', 'praise', 'probable', 'profession.', 'projected', 'prolonged', 'prostrate', 'prove', 'prove.', 'precipitous', 'probability—and', 'presuming', 'presume', 'princetown.”', 'prove', 'prove', 'prove', 'present,', 'propose', 'promise.', 'problem', 'pretend', 'presume?”', 'prim,', 'princetown', 'promise', 'present', 'programme,', 'professionals,', 'pressure', 'prove', 'prepared', 'private', 'preserve', 'presume', 'precipice.', 'probable', 'practical', 'professional', 'presence', 'profiles', 'professional.', 'pressed', 'press', 'prayer', 'preparation', 'prepared', 'propose', 'preparations', 'projecting.', 'printed', 'present', 'problem', 'proved', 'prepared', 'proceeded', 'proved', 'pretended', 'pressure', 'prevented', 'probably', 'print', 'private', 'promptness', 'procured', 'proved', 'probable,', 'printed', 'prompt', 'presence', 'proving', 'presented,', 'predict', 'property?', 'present', 'problem', 'property', 'proofs', 'proportion', 'project', 'previous', 'print', 'protected', 'project', 'protect', 'project', 'project', 'project', 'project', 'printed', 'practically', 'protected', 'project', 'protect', 'project', 'promoting', 
'project', 'project', 'project', 'property', 'project', 'project', 'project', 'project', 'preserve', 'project', 'project', 'project', 'prevent', 'project', 'project', 'promoting', 'project', 'project', 'project', 'project', 'project', 'project', 'prominently', 'project', 'project', 'project', 'protected', 'providing', 'project', 'project', 'project', 'project', 'project', 'prominently', 'project', 'proprietary', 'processing', 'provide', 'project', 'project', 'provide', 'project', 'project', 'providing', 'project', 'provided', 'profits', 'project', 'project', 'project', 'prepare', 'prepare)', 'project', 'project', 'provide', 'project', 'project', 'provide,', 'project', 'project', 'project', 'project', 'project', 'proofread', 'protected', 'project', 'project', 'property', 'project', 'project', 'project', 'provided', 'provided', 'provide', 'providing', 'problem.', 'provided', 'provision', 'provisions.', 'providing', 'project', 'production,', 'promotion', 'project', 'project', 'project', 'project', 'project', 'provide', 'project', 'project', 'project', 'provide', 'project', 'project', 'project', 'project', 'project', 'project', 'project', 'prohibition', 'project', 'project', 'professor', 'project', 'produced', 'project', 'project', 'printed', 'protected', 'project', 'project', 'produce'])]
using RDD Function to group words having same two letters at the start
# Frequency of the exact word "hound". count() runs distributed and returns
# only an integer, instead of collect()-ing every matching element back to
# the driver just to measure the list's length.
z = rdd3.filter(lambda x: x == "hound").count()
z
44
using RDD functions to collect the word hound and checking its frequency
# Build (word, frequency) pairs sorted by descending frequency.
# reduceByKey combines counts map-side before the shuffle, unlike
# groupByKey().mapValues(sum), which ships every single (word, 1)
# record across the network before summing.
rdd3_mapped = rdd3.map(lambda x: (x, 1))
freq_of_words = rdd3_mapped.reduceByKey(lambda a, b: a + b).map(lambda x: (x[1], x[0])).sortByKey(False)
freq_of_words.take(20)
[(315, 'upon'), (245, 'sir'), (220, 'said'), (207, 'one'), (198, 'could'), (187, 'would'), (178, '“i'), (149, 'man'), (114, 'may'), (106, 'us'), (105, 'dr.'), (102, 'see'), (97, 'know'), (95, 'must'), (89, 'henry'), (88, 'holmes'), (84, 'might'), (83, 'project'), (74, 'think'), (73, 'baskerville')]
Making the bag of words (word frequency) using the map-reduce functions
from pyspark import SparkContext
# Plain RDDs have no toDF() attribute until a SparkSession exists — hence False.
hasattr(freq_of_words,"toDF")
False
# Creating a SparkSession around the existing SparkContext attaches toDF()
# to RDDs of tuples, so the frequency RDD can become a Spark DataFrame.
spark = SparkSession(sc)
column_names = ['Frequency', 'Words']
df = freq_of_words.toDF(column_names)
df.printSchema()
df.show(truncate=False)
root |-- Frequency: long (nullable = true) |-- Words: string (nullable = true) +---------+-----------+ |Frequency|Words | +---------+-----------+ |315 |upon | |245 |sir | |220 |said | |207 |one | |198 |could | |187 |would | |178 |“i | |149 |man | |114 |may | |106 |us | |105 |dr. | |102 |see | |97 |know | |95 |must | |89 |henry | |88 |holmes | |84 |might | |83 |project | |74 |think | |73 |baskerville| +---------+-----------+ only showing top 20 rows
Creating a spark dataframe using the frequency of words Rdd
# Pandas DataFrame of the top-20 most frequent non-stop-words (for plotting).
df_freq = pd.DataFrame(freq_of_words.take(20),columns=['Frequency','Words'])
df_freq.head()
| Frequency | Words | |
|---|---|---|
| 0 | 315 | upon |
| 1 | 245 | sir |
| 2 | 220 | said |
| 3 | 207 | one |
| 4 | 198 | could |
Pandas data frame for the frequency of Top 20 words in the corpus without the stop words
# Bar chart of the top-20 word frequencies (Frequency on x, Words on y).
figure = px.bar(df_freq,x="Frequency",y="Words")
figure.show()
Bar chart of the top 20 words in the corpus; from the above chart we can observe that 'upon' is the most common word used in the corpus, followed by the word 'sir'
# Same top-20 data as a scatter plot (Words on x, Frequency on y).
fig = px.scatter(df_freq, y="Frequency",x="Words")
fig.show()
The same can be observed in the above scatter plot that "upon" is the most common word in the corpus.
freq_of_words.count()
9284
# Pull the full frequency table into pandas. collect() fetches every row, so
# there is no need to hard-code take(9284) (the count observed above), which
# would silently truncate the data if the corpus or stop-word list changed.
df_freq_words = pd.DataFrame(freq_of_words.collect(), columns=['Frequency', 'Words'])
df_freq_words.head()
| Frequency | Words | |
|---|---|---|
| 0 | 315 | upon |
| 1 | 245 | sir |
| 2 | 220 | said |
| 3 | 207 | one |
| 4 | 198 | could |
df_freq_words.columns
Index(['Frequency', 'Words'], dtype='object')
df_freq_words['pos'] = df_freq_words['Words'].apply(lambda x: nltk.pos_tag([x])[0][1])
df_freq_words.columns
Index(['Frequency', 'Words', 'pos'], dtype='object')
df_freq_words.head()
| Frequency | Words | pos | |
|---|---|---|---|
| 0 | 315 | upon | IN |
| 1 | 245 | sir | NN |
| 2 | 220 | said | VBD |
| 3 | 207 | one | CD |
| 4 | 198 | could | MD |
# Count how many distinct words fall under each POS tag, as a tidy
# two-column frame ('pos', 'counts') for plotting.
df_pos_counts = df_freq_words['pos'].value_counts().rename_axis('pos').reset_index(name='counts')
df_pos_counts.head()
| pos | counts | |
|---|---|---|
| 0 | NN | 6690 |
| 1 | NNS | 608 |
| 2 | VBN | 564 |
| 3 | JJ | 498 |
| 4 | VBG | 361 |
# Bar chart of part-of-speech tag counts across the corpus vocabulary.
figure2 = px.bar(df_pos_counts,x="pos",y="counts")
figure2.show()
# NOTE(review): incomplete shell command — `!pip3 install` names no package; specify one (e.g. `!pip3 install nltk`) or remove this line
Common noun (NN) has the highest count in the corpus and plural common noun (NNS) is the second highest
Sources: